Note: this is a competition from Kaggle.com, and the input data was retrieved from there.
It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable.
Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)
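For reference, the metric can be computed directly. Below is a minimal sketch (the helper name log_rmse and the example arrays are made up for illustration; mean_squared_error comes from scikit-learn):

import numpy as np
from sklearn.metrics import mean_squared_error

def log_rmse(y_true, y_pred):
    #RMSE between the logarithms of the observed and predicted prices
    return mean_squared_error(np.log(y_true), np.log(y_pred)) ** 0.5

#a 10% over-prediction contributes roughly the same error for a cheap and an expensive house
print(log_rmse(np.array([100000, 500000]), np.array([110000, 550000])))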
The file should contain a header and have the following format:
Id,SalePrice
1461,169000.1
1462,187724.1233
1463,175221
etc.
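A simple way to produce a file in this format is to build a DataFrame and write it with to_csv. A minimal sketch (the ids and prices below are made-up placeholders; the notebook itself assembles the file by hand at the end):

import numpy as np
import pandas as pd

#hypothetical values, only to illustrate the expected layout
test_ids = np.arange(1461, 1464)
predictions = np.array([169000.1, 187724.1233, 175221.0])
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions})
submission.to_csv('submission.csv', index=False)  #writes the "Id,SalePrice" header plus one row per Id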
In [1]:
import numpy as np
import pandas as pd
#load the files
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
data = pd.concat([train, test])
#size of training dataset
train_samples = train.shape[0]
#print some of them
data.head()
Out[1]:
In [2]:
# remove the Id feature
data.drop(['Id'], axis=1, inplace=True)
In [3]:
data.info()
In [4]:
print("Size training: {}".format(train.shape[0]))
print("Size testing: {}".format(test.shape[0]))
In [5]:
datanum = data.select_dtypes([np.number])
datanum.describe()
Out[5]:
In [6]:
data.select_dtypes(exclude=[np.number]).head()
Out[6]:
In [8]:
#number of rows with at least one NaN
print(datanum.shape[0] - datanum.dropna().shape[0])
In [9]:
#list of columns with NaN
datanum.columns[datanum.isnull().any()].tolist()
Out[9]:
In [10]:
#fill NaNs with the column mean
datanum_no_nan = datanum.fillna(datanum.mean())
#check
datanum_no_nan.columns[datanum_no_nan.isnull().any()].tolist()
Out[10]:
In [11]:
import matplotlib.pyplot as plt
datanum_no_nan.drop(['SalePrice'], axis=1).head(15).plot()
plt.show()
In [12]:
#Squeeze the data to [0,1]
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
columns = datanum_no_nan.columns
columns = columns.drop('SalePrice')
print("Features: {}".format(columns))
#work on a copy so datanum_no_nan keeps the unscaled values
data_norm = datanum_no_nan.copy()
In [13]:
data_norm[columns] = scaler.fit_transform(datanum_no_nan[columns])
print("Train shape: {}".format(data_norm.shape))
data_norm.drop(['SalePrice'], axis=1).head(15).plot()
plt.show()
In [14]:
data_norm.describe().T
Out[14]:
In [15]:
#plotting distributions of numeric features
data_norm.hist(bins=50, figsize=(22,16))
plt.show()
In [16]:
data_norm['1stFlrSF'].hist()
plt.show()
In [17]:
#transform the data so it's closer to normal (Box-Cox)
from scipy import stats
data_gauss = data_norm.copy()
for f in datanum.columns.tolist():
    #Box-Cox requires strictly positive values, hence the small offset
    data_gauss[f], _ = stats.boxcox(data_gauss[f] + 0.01)
#rescale again
std_scaler = preprocessing.StandardScaler()
data_gauss[columns] = std_scaler.fit_transform(data_gauss[columns])
data_gauss['1stFlrSF'].hist()
plt.show()
In [18]:
#plotting distributions of numeric features
data_gauss.hist(bins=50, figsize=(22,16))
plt.show()
In [19]:
#one-hot encode the non-numeric columns and join them to the numeric data
data.select_dtypes(exclude=[np.number]).head()
data_categorical = pd.get_dummies(data.select_dtypes(exclude=[np.number]))
data_all = pd.concat([data_norm, data_categorical], axis=1)
In [20]:
#data_norm.columns.tolist()
feat_list = ['1stFlrSF',
#'2ndFlrSF',
#'3SsnPorch',
'BedroomAbvGr',
'BsmtFinSF1',
#'BsmtFinSF2',
#'BsmtFullBath',
#'BsmtHalfBath',
'BsmtUnfSF',
#'EnclosedPorch',
#'Fireplaces',
#'FullBath',
'GarageArea',
'GarageCars',
'GarageYrBlt',
#'GrLivArea',
#'HalfBath',
#'KitchenAbvGr',
'LotArea',
'LotFrontage',
#'LowQualFinSF',
'MSSubClass',
'MasVnrArea',
#'MiscVal',
'MoSold',
'OpenPorchSF',
'OverallCond',
'OverallQual',
'PoolArea',
#'SalePrice',
#'ScreenPorch',
'TotRmsAbvGrd',
'TotalBsmtSF',
'WoodDeckSF',
'YearBuilt',
'YearRemodAdd']
#'YrSold']
In [21]:
%matplotlib inline
import seaborn as sns
fig = plt.figure(figsize=(14, 10))
sns.heatmap(data_norm[feat_list+['SalePrice']].corr())
Out[21]:
In [22]:
#heatmap
fig = plt.figure(figsize=(14, 10))
sns.heatmap(data_norm.corr())
Out[22]:
In [23]:
# Features most correlated with SalePrice
data_norm.corr()['SalePrice'].sort_values().tail(13)
Out[23]:
In [24]:
feat_low_corr = ['KitchenAbvGr',
'EnclosedPorch',
'MSSubClass',
'OverallCond',
'YrSold',
'LowQualFinSF',
'MiscVal',
'BsmtHalfBath',
'BsmtFinSF2',
'MoSold',
'3SsnPorch',
'PoolArea',
'ScreenPorch']
feat_high_corr = ['Fireplaces',
'MasVnrArea',
'YearRemodAdd',
'YearBuilt',
'TotRmsAbvGrd',
'FullBath',
'1stFlrSF',
'TotalBsmtSF',
'GarageArea',
'GarageCars',
'GrLivArea',
'OverallQual']
data_norm_low_corr = data_norm[feat_low_corr]
data_norm_high_corr = data_norm[feat_high_corr]
In [152]:
from sklearn.model_selection import KFold
y = np.array(data_all['SalePrice'])
X = np.array(data_norm_high_corr)
#split back into train and test rows by position (test rows have no true SalePrice; it was filled with the mean)
idx = train_samples
X_train, X_test = X[:idx], X[idx:]
y_train, y_test = y[:idx], y[idx:]
print("Shape X train: {}".format(X_train.shape))
print("Shape y train: {}".format(y_train.shape))
print("Shape X test: {}".format(X_test.shape))
print("Shape y test: {}".format(y_test.shape))
kf = KFold(n_splits=3, random_state=9, shuffle=True)
print(kf)
In [153]:
#plotting PCA
from sklearn.decomposition import PCA
def plotPCA(X, y):
    #project onto the first principal component and plot it against the target
    pca = PCA(n_components=1)
    X_r = pca.fit(X).transform(X)
    plt.plot(X_r, y, 'x')
In [154]:
from sklearn.covariance import EllipticEnvelope
# fit the model
ee = EllipticEnvelope(contamination=0.05,
                      assume_centered=True,
                      random_state=9)
ee.fit(X_train)
pred = ee.predict(X_train)
#keep only the inliers (predict returns 1 for inliers, -1 for outliers)
X_train = X_train[pred == 1]
y_train = y_train[pred == 1]
print(X_train.shape)
print(y_train.shape)
#after removing anomalies
plotPCA(X_train, y_train)
In [155]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
rf = MLPRegressor(activation='relu',
                  solver='lbfgs',
                  #learning_rate_init=1e-2,
                  #learning_rate='adaptive',
                  #alpha=0.0001,
                  max_iter=400,
                  #shuffle=True,
                  hidden_layer_sizes=(64, 64),
                  warm_start=True,
                  random_state=9,
                  verbose=False)
for e in range(1):
    batch = 1
    for train_idx, val_idx in kf.split(X_train, y_train):
        X_t, X_v = X_train[train_idx], X_train[val_idx]
        y_t, y_v = y_train[train_idx], y_train[val_idx]
        #training
        rf.fit(X_t, y_t)
        #calculate costs (RMSE)
        t_error = mean_squared_error(y_t, rf.predict(X_t))**0.5
        v_error = mean_squared_error(y_v, rf.predict(X_v))**0.5
        print("{}-{}) Training error: {:.2f} Validation error: {:.2f}".format(e, batch, t_error, v_error))
        batch += 1
#Scores
print("Training score: {:.4f}".format(rf.score(X_train, y_train)))
In [181]:
# Gradient boosting
from sklearn import ensemble
params = {'n_estimators': 100, 'max_depth': 50, 'min_samples_split': 5,
          'learning_rate': 0.1, 'loss': 'ls', 'random_state': 9, 'warm_start': True}
gbr = ensemble.GradientBoostingRegressor(**params)
batch = 0
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    gbr.fit(X_t, y_t)
    #calculate costs (RMSE)
    t_error = mean_squared_error(y_t, gbr.predict(X_t))**0.5
    v_error = mean_squared_error(y_v, gbr.predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f}".format(batch, t_error, v_error))
    batch += 1
#Scores
print("Training score: {:.4f}".format(gbr.score(X_train, y_train)))
In [157]:
# AdaBoost
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
abr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=50),
                        n_estimators=100, random_state=9)
batch = 0
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    abr.fit(X_t, y_t)
    #calculate costs (RMSE)
    t_error = mean_squared_error(y_t, abr.predict(X_t))**0.5
    v_error = mean_squared_error(y_v, abr.predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f}".format(batch, t_error, v_error))
    batch += 1
#Scores
print("Training score: {:.4f}".format(abr.score(X_train, y_train)))
In [158]:
# Lasso
from sklearn.linear_model import Lasso
lr = Lasso()
batch = 0
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    lr.fit(X_t, y_t)
    #calculate costs (RMSE)
    t_error = mean_squared_error(y_t, lr.predict(X_t))**0.5
    v_error = mean_squared_error(y_v, lr.predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f}".format(batch, t_error, v_error))
    batch += 1
#Scores
print("Training score: {:.4f}".format(lr.score(X_train, y_train)))
In [178]:
### Testing
### Ada + mlp + gradient boosting -> level 1 predictions
### level 1 -> mlp -> level 2 predictions (final)
# Training
#mlp1 = MLPRegressor(activation='logistic',
# solver='sgd',
# hidden_layer_sizes=(5,5),
# learning_rate='adaptive',
# random_state=9,
# warm_start=True,
# verbose=False)
from sklearn.linear_model import LogisticRegression
#NOTE: LogisticRegression is a classifier, so the level-2 model treats every distinct SalePrice as a class
mlp = LogisticRegression(random_state=9)
sclr = preprocessing.StandardScaler()
def stack_training(X, y):
    #level 1: predictions of the base models become the features of the level-2 model
    X0 = rf.predict(X)
    X1 = gbr.predict(X)
    X2 = abr.predict(X)
    X3 = lr.predict(X)
    Xt = np.array([X0, X1, X2, X3]).T
    #Xt = np.array([X0, X1, X2, X3, X1+X3, X2*X3, X0*X2*X3, X0/X2, X1/X3, X0/X3, (X0+X1+X2+X3)/4]).T
    Xt = sclr.fit_transform(Xt)
    mlp.fit(Xt, y)
def stack_predict(X, verbose=False):
    X0 = rf.predict(X)
    X1 = gbr.predict(X)
    X2 = abr.predict(X)
    X3 = lr.predict(X)
    Xt = np.array([X0, X1, X2, X3]).T
    #Xt = np.array([X0, X1, X2, X3, X1+X3, X2*X3, X0*X2*X3, X0/X2, X1/X3, X0/X3, (X0+X1+X2+X3)/4]).T
    Xt = sclr.transform(Xt)
    if verbose:
        #verbose mode assumes X is X_train (it scores against the global y_train)
        print("Training score: {:.4f}".format(mlp.score(Xt, y_train)))
        plotPCA(Xt, y_train)
    return mlp.predict(Xt)
#
batch = 0
kf = KFold(n_splits=10, random_state=9, shuffle=True)
for train_idx, val_idx in kf.split(X_train, y_train):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    #training
    stack_training(X_t, y_t)
    #calculate costs (RMSE of the stacked predictions)
    t_error = mean_squared_error(y_t, stack_predict(X_t))**0.5
    v_error = mean_squared_error(y_v, stack_predict(X_v))**0.5
    print("{}) Training error: {:.2f} Validation error: {:.2f}".format(batch, t_error, v_error))
    batch += 1
rmse = mean_squared_error(y_train, stack_predict(X_train, True))**0.5
print("RMSE: {:.4f}".format(rmse))
In [177]:
from sklearn.metrics import mean_squared_error
import random
RMSE_rf = mean_squared_error(y_train, rf.predict(X_train))**0.5
RMSE_gbr = mean_squared_error(y_train, gbr.predict(X_train))**0.5
RMSE_abr = mean_squared_error(y_train, abr.predict(X_train))**0.5
RMSE_lr = mean_squared_error(y_train, lr.predict(X_train))**0.5
RMSE_stack = mean_squared_error(y_train, stack_predict(X_train))**0.5
def avg_predict(X):
    #simple average of the four base models
    return (rf.predict(X) + gbr.predict(X) + abr.predict(X) + lr.predict(X))/4
predictions = avg_predict(X_train)
RMSE_total = mean_squared_error(y_train, predictions)**0.5
print("RMSE mlp: {:.3f}".format(RMSE_rf))
print("RMSE gbr: {:.3f}".format(RMSE_gbr))
print("RMSE abr: {:.3f}".format(RMSE_abr))
print("RMSE lr: {:.3f}".format(RMSE_lr))
print("====")
print("RMSE average: {:.3f}".format(RMSE_total))
print("RMSE stacked: {:.3f}".format(RMSE_stack))
In [33]:
import os
#predict = avg_predict(X_test)
predict = stack_predict(X_test)
file = "Id,SalePrice" + os.linesep
startId = 1461
for i in range(len(X_test)):
    file += "{},{}".format(startId, int(predict[i])) + os.linesep
    startId += 1
#print(file)
In [34]:
# Save to file
with open('attempt.txt', 'w') as f:
    f.write(file)